library(vcfR)
## Warning: package 'vcfR' was built under R version 3.5.2
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.9.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
library(ggplot2)
library(adegenet)
## Loading required package: ade4
## Warning: package 'ade4' was built under R version 3.5.2
## 
##    /// adegenet 2.1.3 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()
library(StAMPP)
## Warning: package 'StAMPP' was built under R version 3.5.2
## Loading required package: pegas
## Warning: package 'pegas' was built under R version 3.5.2
## Loading required package: ape
## Warning: package 'ape' was built under R version 3.5.2
## 
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
## 
##     mst
## The following object is masked from 'package:ade4':
## 
##     amova
## The following object is masked from 'package:vcfR':
## 
##     getINFO
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
gg_color_hue<- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

#prepare for delimitr
#read in vcf
vcfR<- read.vcfR("~/Desktop/aph.data/unlinked.filtered.recode.vcf")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 2725
##   column count: 104
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 2725
##   Character matrix gt cols: 104
##   skip: 0
##   nrows: 2725
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant: 2725
## All variants processed
vcfR@gt<-vcfR@gt[,c(1,21,48:96)]
#write.table(cbind(colnames(vcfR@gt)[-1],
#      c(rep(3, times=19),5,rep(3, times=8),rep(4, times=4),rep(1, times=6),rep(2, times=8),10,10,
#        rep(9, times=5),10,10,10,rep(5, times=17),rep(6, times=7),7,7,8,8,8,8,7,8,8,7,7,7,8,8,7)),
#      "~/Downloads/pops.txt", row.names = F, quote = F, col.names = F)
#write.table(cbind(colnames(vcfR@gt)[-1],
#      c("intwood","sumi","sumi", rep("sumi", times=5),"sumi","sumi","sumi",rep("intwood", times=17),rep("texas", times=7),rep("mex", times=15))),
#      "~/Downloads/pops.txt", row.names = F, quote = F, col.names = F)
#write.vcf(vcfR, "~/Downloads/woodhouse.unlinked.filtered.vcf.gz")
#make traits file
int<-c()
for (i in 1:28){
  int[i]<-paste0("int",i)
}
sumi<-c()
for (i in 1:16){
  sumi[i]<-paste0("sumi",i)
}
tex<-c()
for (i in 1:10){
  tex[i]<-paste0("tex",i)
}
mex<-c()
for (i in 1:26){
  mex[i]<-paste0("mex",i)
}

d<-data.frame(traits=c(int,sumi,tex,mex),
           species=c(rep(0, times=28),rep(1, times=16),rep(2, times=10),rep(3, times=26)))
#write.table(d,"~/Downloads/scrub.traits.txt", row.names = F, quote = F, col.names = T)
#       fla: 33-38,
#       isl: 39-46,
#       cal: 1-19 21-28,
#       calbaja: 29-32,
#       intwood: 20 57-73,
#       texas: 74-80,
#       southmex: 81 82 87 90-92 95,
#       northmex: 83-86 88 89 93 94,
#       remota: 49-53,
#       sumi: 47 48 54-56;
#convert to genlight
vcfR<- read.vcfR("~/Desktop/aph.data/unlinked.filtered.recode.vcf")
## Scanning file to determine attributes.
## File attributes:
##   meta lines: 14
##   header_line: 15
##   variant count: 2725
##   column count: 104
## 
Meta line 14 read in.
## All meta lines processed.
## gt matrix initialized.
## Character matrix gt created.
##   Character matrix gt rows: 2725
##   Character matrix gt cols: 104
##   skip: 0
##   nrows: 2725
##   row_num: 0
## 
Processed variant 1000
Processed variant 2000
Processed variant: 2725
## All variants processed
gen<- vcfR2genlight(vcfR)
#reorder gen to match species assignments
gen<-gen[c(33:46,1:19,21:32,57:73,81:95,20,74:80,47:56),]

#read in locality info for samples
locs<-read.csv("~/Desktop/aph.data/rad.sampling.csv")
#subset locs to include only samples that passed filtering, and have been retained in the vcf
locs<-locs[locs$id %in% gen@ind.names,]
#order by species assingments
locs<-locs[c(33:46,1:19,21:32,57:73,81:95,20,74:80,47:56),]
rownames(locs)<-1:95
#combine lat/longs for nearby sampling localities 
locs[20,c(2,3)]<-locs[21,c(2,3)]
locs[34,c(2,3)]<-locs[37,c(2,3)]
locs[43,c(2,3)]<-locs[44,c(2,3)]
locs[79,c(2,3)]<-locs[80,c(2,3)]
locs[70,c(2,3)]<-locs[68,c(2,3)]
locs[71,c(2,3)]<-locs[68,c(2,3)]
locs[74,c(2,3)]<-locs[64,c(2,3)]

locs$species<-as.character(locs$species)
locs[c(79:85),4]<-c(rep("texana", times=7))
locs$species<-as.factor(locs$species)

#add sampling locality to locs df
locs$number<-c(rep(26, times=6),rep(11, times=8),rep(10, times=5),rep(8, times=4),rep(7, times=2),
               rep(9, times=3),rep(6, times=5),4,5,5,4,4,rep(3, times=3),2,1,1,2,14,14,13,13,13,
               12,12,12,15,15,18,17,17,17,16,16,16,23,23,20,21,21,21,22,21,21,22,22,23,20,20,22,
               18,rep(19, times=7),25,25,rep(24, times=5),25,25,25)
loc.frame<-locs[,c(1,10)]

table(locs$species)
## 
##  californica coerulescens    insularis  sumichrasti       texana  woodhouseii 
##           31            6            8           10            7           33
table(locs$decimallatitude)
## 
##      16.942          17       21.45      21.851        23.5 24.78333333 
##           5           5           3           4           2           2 
##          25       26.48      27.187  29.8382615  30.5924103          32 
##           5           3           6           7           2           3 
##      33.269      34.024      34.344  34.6467247  34.9369969      35.858 
##           3           8           2           3           2           5 
##  36.2499844     37.2873      37.802        37.9  40.3697149  40.5959797 
##           3           3           2           3           2           4 
##  41.5512538  44.1459603 
##           3           5
#split df by species
spec.dfs<-split(locs, locs$species)

#init sampling.df which will be a df of samples grouped by unique lat/long
sampling.df<-data.frame(NULL)
for (i in names(spec.dfs)){
  samps<-spec.dfs[[i]] %>% dplyr::group_by(decimallatitude, decimallongitude) %>% dplyr::summarize(count=n())
  df<-cbind(rep(i, times=nrow(samps)), samps)
  sampling.df<-as.data.frame(rbind(sampling.df, df))
}
## `summarise()` regrouping output by 'decimallatitude' (override with `.groups` argument)
## New names:
## * NA -> ...1
## `summarise()` regrouping output by 'decimallatitude' (override with `.groups` argument)
## New names:
## * NA -> ...1
## `summarise()` regrouping output by 'decimallatitude' (override with `.groups` argument)
## New names:
## * NA -> ...1
## `summarise()` regrouping output by 'decimallatitude' (override with `.groups` argument)
## New names:
## * NA -> ...1
## `summarise()` regrouping output by 'decimallatitude' (override with `.groups` argument)
## New names:
## * NA -> ...1
## `summarise()` regrouping output by 'decimallatitude' (override with `.groups` argument)
## New names:
## * NA -> ...1
#fix colnames
colnames(sampling.df)<-c("species","lat","long","count")

#use only currently recognized species
sampling.df$species[13:15]<-c("woodhouseii","woodhouseii","woodhouseii")

#make full map
pac<-map_data("world")
#
ggplot()+
  geom_polygon(data = pac, aes(x=long, y = lat, group = group), fill="grey90", col="black", cex=.1)+
  coord_sf(xlim = c(-123, -80), ylim = c(14, 47)) + 
  geom_point(data = sampling.df, aes(x = long, y = lat, col=species, size=count), alpha =.8, show.legend=TRUE) +
  theme_classic()+
  geom_text(data = sampling.df, aes(x = long, y = lat, label = c(1:10,26,11,25,24,19,23,22,21,20,18,17,14,13,12,16,15)),size = 3)+
  scale_color_manual(values=gg_color_hue(4))+
  scale_size_continuous(range = c(4,8))+
  guides(colour = guide_legend(override.aes = list(size = 4), order=1, label.theme = element_text(face = "italic")),
         size = guide_legend(nrow = 1, order = 2))+
  theme(legend.position = c(0.01, 0.01), legend.justification = c(0.01, 0.01),
        legend.background = element_blank())+
  xlab("longitude")+
  ylab("latitude")

#ggsave("~/Desktop/aph.data/sampling.map.pdf", width=8.5,height=6)
pop(gen)<-gen@ind.names
sample.div <- stamppNeisD(gen, pop = TRUE)
#plot NJ tree
plot(nj(sample.div), type = "unrooted", cex = .65)

#export in phylip format for splitstree
names<-rownames(sample.div)
setwd("~/Downloads")
cat("95\n",file="scrub.splits.txt")
cat("\n",file="scrub.splits.txt", append = T)
for (i in 1:nrow(sample.div)){
  cat(c(names[i],sample.div[i,]),file="scrub.splits.txt", sep=" ",append=TRUE)
  cat("\n",file="scrub.splits.txt",append=TRUE)
}

#all samples splitstree
knitr::include_graphics(c("/Users/devder/Desktop/aph.data/scrub.splits.labeled.by.locality.png"))

#SVDquartets
#steps:
#export an unlinked, filtered vcf file
#convert it to a nexus using this ruby script: https://github.com/mmatschiner/tutorials/blob/master/species_tree_inference_with_snp_data/src/convert_vcf_to_nexus.rb
#ruby convert_vcf_to_nexus.rb unlinked.filtered.recode.vcf unlinked.filtered.nex

#use cat to append a taxablock to the end of the nexus that is formatted like this:
#BEGIN SETS;
#   TAXPARTITION SPECIES =
#       fla: 33-38,
#       isl: 39-46,
#       cal: 1-19 21-28,
#       calbaja: 29-32,
#       intwood: 20 57-73,
#       texas: 74-80,
#       southmex: 81 82 87 90-92 95,
#       northmex: 83-86 88 89 93 94,
#       remota: 49-53,
#       sumi: 47 48 54-56;
#  END;

#cat unlinked.filtered.nex taxablock.txt > unlinked.filtered.taxa.nex

#open the nexus in PAUP* GUI and designate Florida Scrub-Jay samples as outgroup.
#run SVDquartets with 100 bootstrap replicates using species tree assignments
#visualize in figtree

#check out species tree produced by SVDquartets from 2725 unlinked SNPs
knitr::include_graphics("/Users/devder/Desktop/aph.data/svd.quartets.labeled.png")

#Total wieght of incompatible quartets = 73.6396 (35.07%)
#Total wieght of compatible quartets = 136.3557 (64.93%)
#do pairwise Fst between groups to determine the set of species assignments to give delimitr
#investigate how divergent pops are
locs$pop<-locs$number
locs$number<-as.numeric(locs$number)
locs$pop[locs$number >= 3 & locs$number<= 10]<-"ca"
locs$pop[locs$number >= 1 & locs$number<= 2]<-"bajaca"
locs$pop[locs$number >= 12 & locs$number <= 18]<-"intwood"
locs$pop[locs$number == 20 | locs$number == 21]<-"northmex"
locs$pop[locs$number == 22 | locs$number == 23]<-"southmex"

gen@pop<-as.factor(locs$pop)

heat<-stamppFst(gen)
colnames(heat)<-rownames(heat)
h<-heat$Fsts
for (i in 1:10){
  h[1:i,i]<-h[i,1:i]
}
colnames(h)<-c("26", "11", "3-10", "1-2", "12-18", "22-23", "20-21", "19", "25", "24")
rownames(h)<-c("26", "11", "3-10", "1-2", "12-18", "22-23", "20-21", "19", "25", "24")

di.heat <- reshape::melt(h)

ggplot(data = di.heat, aes(x=X1, y=X2, fill=value)) + 
  geom_tile()+
  geom_text(aes(label = round(value, 3))) +
  scale_fill_gradient2(low = "white", high = "red", space = "Lab", name="Fst") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.title.x=element_blank(), axis.title.y=element_blank())
## Warning: Removed 10 rows containing missing values (geom_text).

#combine N mex and S mex
locs$pop[locs$number >=20 & locs$number <= 23]<-"mex"
gen@pop<-as.factor(locs$pop)
heat<-stamppFst(gen)
heat$Fsts
##                26        11         ca    bajaca    intwood       mex        19
## 26             NA        NA         NA        NA         NA        NA        NA
## 11      0.8621540        NA         NA        NA         NA        NA        NA
## ca      0.6253431 0.4975256         NA        NA         NA        NA        NA
## bajaca  0.6742401 0.7231965 0.09644642        NA         NA        NA        NA
## intwood 0.6518919 0.5714119 0.21114618 0.2693528         NA        NA        NA
## mex     0.6227512 0.5516446 0.21000892 0.2467519 0.05104432        NA        NA
## 19      0.7215721 0.7550738 0.32537922 0.4066450 0.19067238 0.1852923        NA
## 25      0.7179034 0.7792230 0.35159860 0.4289907 0.30843020 0.2722541 0.4698566
## 24      0.7370232 0.8041800 0.37599290 0.4650739 0.34364310 0.2983678 0.4988431
##                 25 24
## 26              NA NA
## 11              NA NA
## ca              NA NA
## bajaca          NA NA
## intwood         NA NA
## mex             NA NA
## 19              NA NA
## 25              NA NA
## 24      0.08766167 NA
#combine Mex and interior woodhouse
locs$pop[locs$pop == "intwood"]<-"mex"
gen@pop<-as.factor(locs$pop)
heat<-stamppFst(gen)
heat$Fsts
##               26        11         ca    bajaca       mex        19         25
## 26            NA        NA         NA        NA        NA        NA         NA
## 11     0.8621540        NA         NA        NA        NA        NA         NA
## ca     0.6253431 0.4975256         NA        NA        NA        NA         NA
## bajaca 0.6742401 0.7231965 0.09644642        NA        NA        NA         NA
## mex    0.6217650 0.5095950 0.19929581 0.2444662        NA        NA         NA
## 19     0.7215721 0.7550738 0.32537922 0.4066450 0.1655852        NA         NA
## 25     0.7179034 0.7792230 0.35159860 0.4289907 0.2704870 0.4698566         NA
## 24     0.7370232 0.8041800 0.37599290 0.4650739 0.2993915 0.4988431 0.08766167
##        24
## 26     NA
## 11     NA
## ca     NA
## bajaca NA
## mex    NA
## 19     NA
## 25     NA
## 24     NA
#combine ca and bajaca
locs$pop[locs$pop == "bajaca"]<-"ca"
gen@pop<-as.factor(locs$pop)
heat<-stamppFst(gen)
heat$Fsts
##            26        11        ca       mex        19         25 24
## 26         NA        NA        NA        NA        NA         NA NA
## 11  0.8621540        NA        NA        NA        NA         NA NA
## ca  0.6184269 0.4846951        NA        NA        NA         NA NA
## mex 0.6217650 0.5095950 0.1961811        NA        NA         NA NA
## 19  0.7215721 0.7550738 0.3170649 0.1655852        NA         NA NA
## 25  0.7179034 0.7792230 0.3440784 0.2704870 0.4698566         NA NA
## 24  0.7370232 0.8041800 0.3676615 0.2993915 0.4988431 0.08766167 NA
#combine sumi and remota
locs$pop[locs$pop == "24"]<-"25"
gen@pop<-as.factor(locs$pop)
heat<-stamppFst(gen)
heat$Fsts
##            26        11        ca       mex        19 25
## 26         NA        NA        NA        NA        NA NA
## 11  0.8621540        NA        NA        NA        NA NA
## ca  0.6184269 0.4846951        NA        NA        NA NA
## mex 0.6217650 0.5095950 0.1961811        NA        NA NA
## 19  0.7215721 0.7550738 0.3170649 0.1655852        NA NA
## 25  0.7196134 0.7229394 0.3603120 0.2873610 0.4700515 NA
#combine 19 and mex
locs$pop[locs$pop == "19"]<-"mex"
gen@pop<-as.factor(locs$pop)
heat<-stamppFst(gen)
heat$Fsts
##            26        11        ca      mex 25
## 26         NA        NA        NA       NA NA
## 11  0.8621540        NA        NA       NA NA
## ca  0.6184269 0.4846951        NA       NA NA
## mex 0.6201645 0.5043718 0.2001905       NA NA
## 25  0.7196134 0.7229394 0.3603120 0.290504 NA
#show delimitr species delimitation results on svdquartets tree
#visualize raxml tree
knitr::include_graphics("/Users/devder/Downloads/phylo.meth.fig2.png")

#show BEAST2 tree
knitr::include_graphics("/Users/devder/Desktop/beast2.tmv.ig.png")